========
========
This is a Principal Component Analysis conducted on the Summer of 2022 for Elizabeth’s EnviroDay 2025 poster. This file uses a series of functions that enable a more streamlined way to output our plots in a repetitive manner over longer date ranges. The script is organized as follows:
filepath is a link to our
data uploaded to GitHub. exportpath is a local directory
and needs to be updated to somewhere on your own machine.exportpath folder.pick_sites(cursite): given a site
(cursite: string), returns a df of data only for that site,
selected from the daily df.pick_dates(datemin, datemax, big_df): given a df
(big_df) and the min (datemin: string) and max
(datemax: string) dates of interest, returns a df with only
that date range.calc_pca(pca_df): given a df (pca_df),
returns TWO items in a list - (1) pca: a pca object; and
(2) loads: a df of the loadings from the pca.make_scree(pca): given a pca object (pca),
creates a scree plot.make_eigen: given a pca object (pca),
outputs the eigenvectors and eigenvalues.make_pca) to plot
the PCA given:
pca_df: a df with relevant dataszn: the season for which you want to plot the PCA, as
a stringyr: the year for which you want to plot the PCA, as a
stringcursite: the site for which you want to plot the PCA,
as a stringexp_pdf
to TRUE, it will override the setting in the SETUP section. Only use this for testing so that you
do not have to scroll all the way to the top. Set this setting to
FALSE before you are done.)========
========
filepath = "https://raw.githubusercontent.com/shabanm2/Utqiagvik/main/Analysis_Ready_Data/" # where daily avg data are located
exportpath = "/Users/emvanmetre/Desktop/Arctic/Utqiagvik/elizabeth seasonal analysis/PCA/" # where to export pdf files (if pdf is true)
years = c("2022", "2023")
seasons = c("Summer", "Fall", "Spring")
sites = c("TNHA", "BEO", "SSMH")
# put season values for the season that has the start of the data
date_start = "2022-06-01" # data starts in June 2022 (YEAR-MO-DY) where day is always 01
# put season values for the season that has the last of the data
date_end = "2023-11-01" # data ends after November of 2023 (will get data up UNTIL date_end not after)
scree = T # scree plot
eigen = T # eigenvectors and eigenvalues
pdf = F # export plots to pdf (in the export path specified above)
========
========
spring_months = c("March", "April", "May")
summer_months = c("June","July","August")
fall_months = c("September", "October", "November")
winter_months = c("December", "January", "February")
spring_dates = data.frame(months=spring_months, start=c("-03-01","-04-01","-05-01"), end=c("-04-01","-05-01","-06-01"))
summer_dates = data.frame(months=summer_months, start=c("-06-01","-07-01","-08-01"), end=c("-07-01","-08-01","-09-01"))
fall_dates = data.frame(months=fall_months, start=c("-09-01","-10-01","-11-01"), end=c("-10-01","-11-01","-12-01"))
winter_dates = data.frame(months=winter_months, start=c("-12-01","-01-01","-02-01"), end=c("-01-01","-02-01","-03-01"))
all_dates = data.frame(matrix(nrow = 0, ncol = 3))
for(yur in years){
# spring
if("Spring" %in% seasons){
curdate = as.Date(paste0(yur, spring_dates[1, 2]))
if(curdate >= as.Date(date_start) && curdate < as.Date(date_end)){
all_dates <- rbind(all_dates, (c("Spring", paste0(yur, spring_dates[1, 2]), paste0(yur, spring_dates[3, 3]))))
}
}
if("Summer" %in% seasons){
curdate = as.Date(paste0(yur, summer_dates[1, 2]))
if(curdate >= date_start && curdate < date_end){
all_dates <- rbind(all_dates, c("Summer", paste0(yur, summer_dates[1, 2]), paste0(yur, summer_dates[3, 3])))
}
}
if("Fall" %in% seasons){
curdate = as.Date(paste0(yur, fall_dates[1, 2]))
if(curdate >= date_start && curdate < date_end){
all_dates <- rbind(all_dates, c("Fall", paste0(yur, fall_dates[1, 2]), paste0(yur, fall_dates[3, 3])))
}
}
if("Winter" %in% seasons){
curdate = as.Date(paste0(yur, winter_dates[1, 2]))
if(curdate >= date_start && curdate < date_end){
all_dates <- rbind(all_dates, c("Winter", paste0(yur, winter_dates[1, 2]), paste0(yur, winter_dates[3, 3])))
}
}
}
colnames(all_dates) = c("szn","start","end")
library(dplyr)
library(lubridate)
library(tidyverse)
daily <- read.csv(paste0(filepath, "daily_2022_2024.csv"))
daily <- daily %>% select(-X) %>% select(-X.1) # get rid of index columns
daily$Date <- as.POSIXct(daily$date, format="%Y-%m-%d") # format dates
daily$fullname[daily$site == "BEO"] <- "BEO-BASE"
daily <- daily %>% filter(fullname == "TNHA-SA" | fullname == "TNHA-SC" | fullname == "SSMH-SB" | fullname == "SSMH-SA" | fullname == "BEO-BASE") %>% select(-c(winddir, date)) %>% mutate(aspect = case_when(fullname == "TNHA-SC" | fullname == "SSMH-SB" ~ "North", fullname == "TNHA-SA" | fullname == "SSMH-SA" ~ "South", .default = "N/A")) %>% filter(grounddepth == 8) %>% filter(Date >= "2022-06-19") %>% na.omit() %>% filter(windspeed >= 0) # create "aspect" column and filter for top depth of soil and start date of when we started collecting data
# note: the data before June 19, 2022 was estimated by our gap-filling script and should be disregarded due to extrapolation
=========================
=========================
Temporal Range: Season Vertical Spatial Range: 30-45 cm Horizontal Spatial Range: stations across site (TNHA, SSMH, BEO) –> Average Total Site –> North vs South (except for BEO)
pick_site <- function(cursite){
big_df <<- daily %>% filter(site == cursite)
if(szn == "Winter"){
big_df <<- big_df %>% select(-solar)
}
return(big_df)
}
pick_dates <- function(datemin, datemax, big_df){
pca_df <<- big_df %>% filter(Date >= datemin) %>% filter(Date < datemax)
# get rid of NAs
pca_df <<- na.omit(pca_df)
pca_df <<- unique(pca_df)
return(pca_df)
}
calc_pca <- function(pca_df){
pca <<- prcomp(pca_df[,6:(ncol(pca_df)-2)], center=TRUE, scale.=TRUE)
#take out variables
sd <- pca$sdev
loads <<- pca$rotation
rownames(loads) <<- colnames(pca_df[6:(ncol(pca_df)-2)])
scores <<- pca$x
var <- sd^2
varPercent <- var/sum(var) * 100
return(list("pca"=pca, "loads"=loads))
}
make_scree <- function(pca){
sd <- pca$sdev
var <- sd^2
varPercent <- var/sum(var) * 100
barplot(varPercent, xlab="PC", ylab="Percent Variance", names.arg=1:length(varPercent),
las=1, ylim=c(0, max(varPercent)), col="gray")
abline(h=1/ncol(pca_df[5:ncol(pca_df)])*100, col="red")
}
make_eigen <- function(pca){
eigenvectors <- pca$rotation
print("Eigenvectors (Loadings):")
print(eigenvectors)
print("Loadings Cutoff:")
sqrt(1/ncol(pca_df[5:ncol(pca_df)])) # cutoff for "important" loadings
# Access the eigenvalues (variances of the principal components)
eigenvalues <- (pca$sdev)^2
print("Eigenvalues:")
print(eigenvalues)
}
===============
===============
plotcounter <<- 0
make_pca <- function(pca_df, szn, yr, cursite){
if (pdf | exp_pdf) {
pdf(file = paste0(exportpath, "pca_", plotcounter, ".pdf"), # The directory you want to save the file in
width = 8, # The width of the plot in inches
height = 8) # The height of the plot in inches
plotcounter <<- plotcounter + 1
}
if (cursite == "BEO")
{
SOUTH <<- pca_df$site == cursite
# n <- "BEO"
} else {
SOUTH <<- pca_df$site == cursite & pca_df$aspect == "South"
NORTH <<- pca_df$site == cursite & pca_df$aspect == "North"
s <- pca_df$fullname[SOUTH][1]
n <- pca_df$fullname[NORTH][1]
}
xmin = floor(min(scores[,1])*limNudge)
xmax = ceiling(max(scores[,1])*limNudge)
ymin = floor(min(scores[,2])*limNudge)
ymax = ceiling(max(scores[,2])*limNudge)
xlimit <- seq(xmin, xmax, 1)
ylimit <- seq(ymin, ymax, 1)
plot(scores[, 1], scores[, 2], xlab="Principal Component 1", ylab="Principal Component 2", type="n", asp=1,
las=1, xaxt='n', yaxt='n')
axis(side = 1, at=xlimit)
axis(side = 2, at=ylimit)
if (cursite == "BEO")
{
nvstext <- " "
} else {
nvstext <- " North v. South "
}
mindate = format(as.Date(min(pca_df$Date)), format="%B %d %Y")
maxdate = format(as.Date(max(pca_df$Date)), format="%B %d %Y")
title(paste0(szn, " ", yr," Principal Component Analysis:\n", site, nvstext, "\n(", mindate," - ", maxdate, ")"), adj=0.5)
points(scores[SOUTH, 1], scores[SOUTH, 2], pch=16, cex=1, col="mediumturquoise")
if(cursite != "BEO"){
points(scores[NORTH, 1], scores[NORTH, 2], pch=16, cex=1, col="salmon")
legend(x = "topright", # Position
legend = c(paste0(s, " (south)"), paste0(n, " (north)")), # Legend texts
col = c("mediumturquoise","salmon"),
pch = 19) #colors
} else{
legend(x = "topright", # Position
legend = "BEO", # Legend texts
col = "mediumturquoise",
pch = 19)
}
arrows(0, 0, loads[, 1]* scaling, loads[, 2]* scaling, length=0.1, angle=30, col="darkred", lwd=2)
arr1x = loads[1, 1]*scaling*textNudge
arr1y = loads[1, 2]*scaling*textNudge
if(arr1y < ymax / 2 & arr1y > ymin / 2)
{
arr1x = loads[1, 1]*scaling*(textNudge+(textAdj * length(rownames(loads)[1]))+0.3)
}
arr2x = loads[2, 1]*scaling*textNudge
arr2y = loads[2, 2]*scaling*textNudge
if(arr2y < ymax / 2 & arr2y > ymin / 2)
{
arr2x = loads[2, 1]*scaling*(textNudge+(textAdj * length(rownames(loads)[2])))
}
# check if any text is overlapping groundtemp text (arr1y)
arr_overlap = data.frame(arrow=1, x=arr1x, y=arr1y)
if(abs(arr1x-arr2x) < 1 & abs(arr1y-arr2y) < 0.8) {
arr_overlap = rbind(arr_overlap, data.frame(arrow=2, x=arr2x, y=arr2y))
}
if(nrow(loads) > 2){
arr3x = loads[3, 1]*scaling*(textNudge+0.2)
arr3y = loads[3, 2]*scaling*textNudge
if(arr3y < ymax / 2 & arr3y > ymin / 2)
{
arr3x = loads[3, 1]*scaling*(textNudge+(textAdj * length(rownames(loads)[3])))
}
if(abs(arr1x-arr3x) < 1 & abs(arr1y-arr3y) < 0.8) {
arr_overlap = rbind(arr_overlap, data.frame(arrow=3, x=arr3x, y=arr3y))
}
if(nrow(loads) > 3){
arr4x = loads[4, 1]*scaling*textNudge-0.2
arr4y = loads[4, 2]*scaling*textNudge
if(arr4y < ymax / 2 & arr4y > ymin / 2)
{
arr4x = loads[4, 1]*scaling*(textNudge+(textAdj * length(rownames(loads)[4])))
}
if(abs(arr1x-arr4x) < 1 & abs(arr1y-arr4y) < 0.8) {
arr_overlap = rbind(arr_overlap, data.frame(arrow=4, x=arr4x, y=arr4y))
}
if(nrow(loads) > 4){
arr5x = loads[5, 1]*scaling*textNudge
arr5y = loads[5, 2]*scaling*textNudge
if(arr5y < ymax / 2 & arr5y > ymin / 2)
{
arr5x = loads[5, 1]*scaling*(textNudge+(textAdj * length(rownames(loads)[5])))
}
if(abs(arr1x-arr5x) < 1 & abs(arr1y-arr5y) < 0.8) {
arr_overlap = rbind(arr_overlap, data.frame(arrow=5, x=arr5x, y=arr5y))
}
}
}
}
arr_overlap = arr_overlap[order(arr_overlap$y, decreasing=F),]
arr1y = arr_overlap$y[floor(length(arr_overlap$arrow)/2)] + ((which(arr_overlap$arrow == 1) - floor(length(arr_overlap$arrow)/2))*0.3)
if(2 %in% arr_overlap$arrow) {
arr2y = arr_overlap$y[floor(length(arr_overlap$arrow)/2)] + ((which(arr_overlap$arrow == 2) - floor(length(arr_overlap$arrow)/2))*0.3)
}
if(3 %in% arr_overlap$arrow) {
arr3y = arr_overlap$y[floor(length(arr_overlap$arrow)/2)] + ((which(arr_overlap$arrow == 3) - floor(length(arr_overlap$arrow)/2))*0.3)
}
if(4 %in% arr_overlap$arrow) {
arr4y = arr_overlap$y[floor(length(arr_overlap$arrow)/2)] +((which(arr_overlap$arrow == 4) - floor(length(arr_overlap$arrow)/2))*0.3)
}
if(5 %in% arr_overlap$arrow) {
arr5y = arr_overlap$y[floor(length(arr_overlap$arrow)/2)] + ((which(arr_overlap$arrow == 5) - floor(length(arr_overlap$arrow)/2))*0.3)
}
text(arr1x, arr1y, rownames(loads)[1], col="darkred", cex=1) # ground label
text(arr2x, arr2y, rownames(loads)[2], col="darkred", cex=1) # vwc label
if(nrow(loads) > 2){
text(arr3x, arr3y, rownames(loads)[3], col="darkred", cex=1) # airtemp label
if(nrow(loads)>3){
text(arr4x, arr4y, rownames(loads)[4], col="darkred", cex=1) # solar or wind label
if(nrow(loads)>4){
text(arr5x, arr5y, rownames(loads)[5], col="darkred", cex=1) # solar or wind label
}
}
}
if(pdf | exp_pdf) {
dev.off()
}
}
Adjust Settings for PCA Plots
# settings for pca plots
scaling <<- 2 # scaling the arrows
textNudge <<- 1.2 # nudge the arrow labels (added)
textAdj <<- 0.3 # adjust the text position based on the length of the label (multiplied)
limNudge <<- 1.3 # nudge the plot limits
exp_pdf <<- F
pca_output = list()
loads_output = data.frame(matrix(nrow = 0, ncol = 5))
colnames(loads_output) = c("PC1", "PC2", "PC3", "PC4", "PC5")
for(i in c(1:nrow(all_dates))){
# month <- all_dates$months[i]
startdate <- all_dates$start[i]
enddate <- all_dates$end[i]
szn <<- all_dates$szn[i]
yr <<- substr(all_dates$start[i], 1, 4)
sitecounter = 0
for(site in sites){
sitecounter = sitecounter + 1
big_df <- pick_site(site)
pca_df <- pick_dates(startdate, enddate, big_df)
if(nrow(pca_df) > 4){
p <- calc_pca(pca_df)
pca <- p$pca
loads <- p$loads
pca_output[((3*(i-1)) + sitecounter)] = p
loads_output = rbind(loads_output, loads)
if(scree == T){
make_scree(pca)
}
if(eigen == T){
make_eigen(pca)
}
make_pca(pca_df, szn, yr, site)
}
}
}
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp -0.59316722 0.01371483 -0.1436874 -0.2447185 -0.7532804
## vwc 0.09411508 -0.76660761 -0.5972068 0.2120092 -0.0430270
## airtemp -0.51334693 0.24874263 -0.5258625 -0.2351603 0.5854658
## solar -0.51068827 -0.01442696 0.2242964 0.8248497 0.0911229
## windspeed 0.33906846 0.59164486 -0.5439281 0.3993588 -0.2822122
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 2.5176450 1.2045979 0.6988812 0.4511032 0.1277727
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp 0.5279039 -0.4245747 0.01650233 0.3356175 0.6543259
## vwc -0.4510438 -0.3358896 0.53025537 -0.5008702 0.3894816
## airtemp 0.4697885 -0.5543090 0.18563124 -0.3094427 -0.5846593
## solar 0.4668158 0.4392134 -0.17999285 -0.6957082 0.2697542
## windspeed -0.2815292 -0.4546951 -0.80727971 -0.2381707 0.0746182
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 2.5099966 1.0679698 0.8515461 0.4326108 0.1378767
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp 0.61846993 0.02296693 -0.10610445 -0.30780331 -0.71481916
## vwc -0.19617546 -0.68832143 0.61506200 0.08491987 -0.31971259
## airtemp 0.56349748 -0.23602296 0.29051147 -0.40853252 0.61275446
## solar 0.50913593 -0.09530177 -0.02335743 0.85185760 0.07410295
## windspeed 0.04755348 0.67889385 0.72491248 0.07406452 -0.07653857
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 2.2292796 1.2670170 0.7743745 0.5640230 0.1653059
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp -0.5188334 -0.2591134 0.09953072 0.1962793 0.78437250
## vwc -0.5006078 0.1196819 0.14142273 -0.8397538 -0.09940486
## airtemp -0.4982153 -0.2381011 0.45510722 0.4069253 -0.56778393
## solar -0.4684748 0.1955643 -0.81831163 0.1917578 -0.18942272
## windspeed 0.1118777 -0.9075235 -0.30551064 -0.2322257 -0.12891432
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 3.26823358 1.10983045 0.34361118 0.22354873 0.05477606
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp -0.55461873 0.09830316 0.09683303 0.3870113 0.723588376
## vwc -0.41997252 0.04379749 -0.82032384 -0.3855291 -0.011873441
## airtemp -0.55037091 0.10103335 0.08434257 0.4527768 -0.689030964
## solar -0.46040190 -0.20814388 0.54072180 -0.6714872 -0.037829357
## windspeed 0.03380975 0.96689537 0.13490152 -0.2137467 -0.009173527
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 3.02682023 1.03963544 0.62588289 0.28907360 0.01858783
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp -0.6181861 -0.13045621 -0.14289850 0.01012654 0.76177725
## vwc -0.4260135 0.54466973 0.06872628 0.67480714 -0.24851444
## airtemp -0.6217652 0.04380794 -0.01706140 -0.60741036 -0.49218942
## solar -0.1353901 -0.65122441 -0.55477920 0.37496843 -0.33044669
## windspeed -0.1773019 -0.51019966 0.81656945 0.18704042 -0.08056399
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 2.36023181 1.64070579 0.73832022 0.16594468 0.09479749
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp 0.5496265 0.3360194 -0.01718172 0.6520608 0.39940349
## vwc 0.2446729 -0.6179679 -0.48629346 0.3652437 -0.43401216
## airtemp 0.5681201 0.3838985 0.17480534 -0.2728446 -0.65181313
## solar 0.5492348 -0.3558749 -0.12690121 -0.5740995 0.47539468
## windspeed -0.1167330 0.4808136 -0.84650023 -0.1933372 0.03535293
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 2.0075407 1.5564325 0.8311946 0.4557631 0.1490691
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp 0.5934680 0.07729637 -0.09058575 0.35885504 0.71051970
## vwc 0.1033732 -0.71068408 0.69081142 0.07215989 0.04259889
## airtemp 0.5941810 0.02584992 -0.05944543 0.38698065 -0.70213453
## solar 0.5041789 -0.20473033 -0.20017291 -0.81464897 -0.01292165
## windspeed 0.1728068 0.66810979 0.68627316 -0.22936661 -0.01368274
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 2.62439467 1.06704921 0.90313709 0.37967322 0.02574581
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp 0.56178673 0.13537319 0.05309631 0.6289212 0.51740574
## vwc 0.53107710 -0.08022778 -0.45569081 -0.6511497 0.28261376
## airtemp 0.57291503 0.14006036 -0.01943572 0.1169098 -0.79881525
## solar -0.05770872 -0.77528516 -0.50721311 0.3543158 -0.11312735
## windspeed -0.26606551 0.59544405 -0.72930363 0.2031122 -0.03895076
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 2.7523521 1.3493338 0.5989767 0.1939568 0.1053806
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp 0.4507066 -0.17125480 -0.6997813 -0.26683118 -0.45457959
## vwc 0.4615618 0.31176078 0.2223796 0.68915128 -0.40667397
## airtemp 0.5556088 0.02114495 -0.2172939 0.15315025 0.78751517
## solar -0.4012250 -0.47964541 -0.4238519 0.65315113 0.05198062
## windspeed -0.3378485 0.80185421 -0.4837214 0.06172708 0.07135506
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 2.7714850 0.8058638 0.6946370 0.5701438 0.1578703
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp -0.6526614 0.11344882 -0.103893874 -0.2117379 0.71101024
## vwc 0.2964719 0.66244527 -0.007204097 -0.6867948 -0.03913699
## airtemp -0.6108718 0.09709442 -0.388361132 -0.1277341 -0.67102011
## solar -0.2896022 -0.19861904 0.858572186 -0.3140736 -0.20221925
## windspeed 0.1705982 -0.70669568 -0.318093691 -0.6070599 0.04209654
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 1.9829288 1.2671797 0.9754763 0.5708566 0.2035586
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp -0.52551892 -0.02879635 0.5076012 -0.1908944 -0.65490533
## vwc -0.41810219 0.29990166 -0.7359578 -0.4218452 -0.12514885
## airtemp -0.55489551 -0.03413200 0.3004256 -0.2189802 0.74345014
## solar -0.49084463 -0.21164278 -0.2618377 0.8028564 -0.03378692
## windspeed -0.01352553 0.92912438 0.2046767 0.3050824 0.03971283
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 2.90961305 1.09293790 0.61825528 0.32204566 0.05714811
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp -0.5696368 0.03900851 -0.2673704 -0.37120162 -0.681699821
## vwc -0.3012312 0.23124643 0.9207486 -0.01446982 -0.088303835
## airtemp -0.5739028 0.17454938 -0.1672462 -0.30428850 0.720836546
## solar -0.5045420 -0.32886187 -0.0702316 0.79520289 -0.002677829
## windspeed 0.0290973 0.89798987 -0.2187039 0.37021731 -0.088742834
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 2.6550826 1.1347195 0.8230307 0.3120675 0.0750997
## Warning in pca_output[((3 * (i - 1)) + sitecounter)] <- p: number of items to
## replace is not a multiple of replacement length
## [1] "Eigenvectors (Loadings):"
## PC1 PC2 PC3 PC4 PC5
## groundtemp -0.58129743 0.007938726 0.06743657 0.50268525 0.63623119
## vwc -0.05548794 -0.909198966 -0.36066231 -0.15786807 0.12360705
## airtemp -0.59110050 -0.139430942 0.07386334 0.25602246 -0.74843565
## solar -0.53187523 0.127079130 0.16244094 -0.81010192 0.13530502
## windspeed 0.16347269 -0.371085062 0.91297982 0.02392892 0.03831195
## [1] "Loadings Cutoff:"
## [1] "Eigenvalues:"
## [1] 2.61274379 1.04014707 0.94375654 0.33198573 0.07136687
===============
===============
Principle component analysis is an analysis technique that uses linear algebra to reduce the dimensionality of a large data set to a smaller one that still contains most of the information in the original data set. While we do sacrifice some accuracy, it makes the data easier to visualize and allows us to understand the relationship between variables within our data.
The principal components are new variables that are uncorrelated linear combinations of the initial variables. When we reduce the dimensionality of the data into a number of principal components, we aim to put as much information as possible in the first component, then as much of the remaining information as possible into the next component, and so on for all the principal components (the number of variables we are examining in the data).
The amount of information stored in each principal component is stored can be visualized with a scree plot, like the following:
Covariance is a statistical measure of the directional relationship between two variables, or vectors. It measures how the mean values of the two variables relate. It is similar to variance, as both look at the distribution of points in a data set. However, covariance is different from the correlation coefficient — the correlation coefficient measures the strength of a relationship between two variables whereas the covariance only measures the direction of the relationship (as positive or negative).
Positive Covariance: a value for variable A that is higher than A’s mean corresponds with the same point in variable B that is also higher than B’s mean.
Negative Covariance: a value for variable A that is higher than A’s mean corresponds with the same point in variable B that is lower than B’s mean (or vice versa).
PCA calculates a covariance matrix of the variance of the vectors and the covariance between the vectors, \(v_1, v_2, \ldots , v_n\). \(Var(v_1)\) is the variance of the first vector, \(v_1\) and \(Cov(v_1, v_n)\) is the covariance of the first vector, \(v_1\), and the last vector, \(v_n\).
\[ \begin{equation} \begin{pmatrix} Var(x_1) & \cdots & Cov(x_1, x_n) \\ \vdots & \ddots & \vdots \\ Cov(x_n, x_1) & \cdots & Var(x_n) \end{pmatrix}\end{equation} \]
For example, if we take our five variables: groundtemp,
airtemp, solar, vwc, and
windspeed, we can make a matrix like this:
\[ \begin{equation} \begin{pmatrix} Var(groundtemp) & Cov(groundtemp, airtemp) & Cov(groundtemp, solar) & Cov(groundtemp, vwc) & Cov(groundtemp, windspeed) \\ Cov(groundtemp, airtemp) & Var(airtemp) & Cov(airtemp, solar) & Cov(airtemp, vwc) & Cov(airtemp, windspeed) \\ Cov(groundtemp, solar) & Cov(airtemp, solar) & Var(solar) & Cov(solar, vwc) & Cov(solar, windspeed) \\ Cov(groundtemp, vwc) & Cov(airtemp, vwc) & Cov(solar, vwc) & Var(vwc) & Cov(vwc, windspeed) \\ Cov(groundtemp, windspeed) & Cov(airtemp, windspeed) & Cov(solar, windspeed) & Cov(vwc, windspeed) & Var(windspeed) \end{pmatrix}\end{equation} \]
Eigenvectors and eigenvalues are computed from the covariance matrix to determine the principle components of our data. They come in pairs, such that each eigenvector has a corresponding eigenvalue. The number of dimensions in the data, or the number of variables, determines the number of pairs. For example, we have five variables and therefore we look at 5 eigenvector/eigenvalue pairs and thus 5 principle components.
Loadings (eigenvectors) of the principal components are the amount of variance captured for each variable Scalars (eigenvalues) are the leftover scalars after dimensionality reduction
\(A \vec{v} = \lambda \vec{v}\) where \(\vec{v}\) is an eigenvector and \(\lambda\) is our eigenvalue scalar.
If we set our equation equal to zero such that \(A\vec{v} - \lambda \vec{v} = 0\) and solve for \(\lambda\), we can get our eigenvectors. Note that this is a complex matrix operation so R does this for us!
TNHA South has more variation along PC1
TNHA North has more variation along PC2 than south, but generally has variance on both PCs
i = 1 # which loads you want to see
loads_output[c(1 + (5 * (i-1)):(5*i)-1),]
## PC1 PC2 PC3 PC4 PC5
## groundtemp -0.59316722 0.01371483 -0.1436874 -0.2447185 -0.7532804
## vwc 0.09411508 -0.76660761 -0.5972068 0.2120092 -0.0430270
## airtemp -0.51334693 0.24874263 -0.5258625 -0.2351603 0.5854658
## solar -0.51068827 -0.01442696 0.2242964 0.8248497 0.0911229
## windspeed 0.33906846 0.59164486 -0.5439281 0.3993588 -0.2822122